Set up

suppressPackageStartupMessages({
  library(tidyverse)
})

Directories and File Inputs/Outputs

# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv")
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "oncoprint_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))
source(paste0(analysis_dir, "/util/function-create-barplot.R"))

Read in data and process

pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>% 
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_multiple, cg_id, cgGFAC, tumor_descriptor)

tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>% 
  filter(!tmb >= 10) %>% 
  select(Kids_First_Biospecimen_ID, Variant_Classification, gene_protein, mutation_count,   region_size, tmb, VAF)

genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>% 
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))

# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.

# Which patient samples don't have TMB?
# genomic_paired_df %>% 
#  filter(is.na(tmb)) %>% 
#  unique() %>% 
#  regulartable() %>%
#  fontsize(size = 12, part = "all")

descriptors_df <- genomic_paired_df %>%
  group_by(Kids_First_Participant_ID) %>%
  summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "),) 

# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")

df <- genomic_paired_df %>% 
  left_join(descriptors_df, by = c("Kids_First_Participant_ID", "descriptors")) %>% 
  mutate(td_cgGFAC = case_when(grepl("Deceased", tumor_descriptor) ~ "xDeceased",
                               TRUE ~ tumor_descriptor),
         log10_tmb = abs(log10(tmb)),
         cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
         cg_id_kids = str_replace(cg_id_kids, "/", "_"),
         cg_id_kids = str_replace(cg_id_kids, "-", "_"),
         cg_id_kids = str_replace_all(cg_id_kids, " ", "_"))

# Let's count #samples per cancer groups and timepoints.
# We will use the cg_id col that indicates cancer type as identified at the first diagnostic sample
timepoint_cg_n_df <- df %>% 
  count(cg_id, tumor_descriptor) %>% 
  dplyr::mutate(timepoint_cg_n = glue::glue("{cg_id}_{tumor_descriptor}  (N={n})")) %>% 
  dplyr::rename(timepoint_cg_number = n) 

# Let's count number of samples per cancer groups and timepoints 
timepoint_cgGFAC_n_df <- df %>% 
  count(cgGFAC, td_cgGFAC) %>% 
  dplyr::mutate(timepoint_cgGFAC_n = glue::glue("{cgGFAC}_{td_cgGFAC}  (N={n})")) %>% 
  dplyr::rename(timepoint_cgGFAC_number = n) 

# Create df to use for plots
df_plot <- df %>% 
  left_join(timepoint_cg_n_df, by = c("tumor_descriptor", "cg_id")) %>%
  left_join(timepoint_cgGFAC_n_df, by = c("td_cgGFAC", "cgGFAC")) %>% 
  filter(!timepoint_cg_n <= 2,
         !timepoint_cgGFAC_n <= 2,
         !cg_id == "NA") %>% 
  mutate(tumor_descriptor = factor(tumor_descriptor),
         tumor_descriptor = fct_relevel(tumor_descriptor, timepoints))
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `tumor_descriptor = fct_relevel(tumor_descriptor, timepoints)`.
Caused by warning:
! 1 unknown level in `f`: Unavailable
# Let's count number of samples 
count_df <- df_plot %>% 
  group_by(tumor_descriptor, cg_id, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(cg_id) 

#count_df <- df_plot %>% 
#  dplyr::count(cg_id) %>% 
#  mutate(pct = n / sum(n) * 100)

Define parameters for plots

# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) 

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$color_names

Alterations per timepoint

# Define parameters for function
x_value <- count_df$tumor_descriptor
title <- paste("Variant types in PBTA cohort", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_barplots.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Alteration_type_timepoints_barplots.pdf"
p <- create_stacked_barplot_variant(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 6, height = 6)
print(p)
dev.off()
quartz_off_screen 
                2 

Alterations per timepoint in each cancer type

cg_id_id <- as.character(unique(count_df$cg_id)) 
cg_id_id <- sort(cg_id_id, decreasing = FALSE)
cg_id_id
 [1] "Adamantinomatous Craniopharyngioma"       "Atypical Teratoid Rhabdoid Tumor"         "Chordoma"                                
 [4] "Choroid plexus carcinoma"                 "CNS Embryonal tumor"                      "Craniopharyngioma"                       
 [7] "Diffuse midline glioma"                   "Dysembryoplastic neuroepithelial tumor"   "Embryonal tumor with multilayer rosettes"
[10] "Ependymoma"                               "Ewing sarcoma"                            "Ganglioglioma"                           
[13] "Glial-neuronal tumor"                     "Hemangioblastoma"                         "High-grade glioma"                       
[16] "Low-grade glioma"                         "Malignant peripheral nerve sheath tumor"  "Medulloblastoma"                         
[19] "Meningioma"                               "Neuroblastoma"                            "Neurofibroma/Plexiform"                  
[22] "Pilocytic astrocytoma"                    "Rosai-Dorfman disease"                    "Schwannoma"                              
# Define parameters for function
x_value <- count_df$Kids_First_Biospecimen_ID
title <- paste("Variant types in PBTA cohort across cancer groups", sep = " ")

# Run function
p <- create_stacked_barplot_variant_cg_id(count_df = count_df, x = x_value, palette = palette, title = title)

Alterations per timepoint in each cancer type defined by cgGFAC

df_plot_cgGFAC <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  arrange(timepoint_cgGFAC_n) %>% 
  group_by(tumor_descriptor, cgGFAC, timepoint_cgGFAC_n, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC)) 
cgGFAC_id <- sort(cgGFAC_id, decreasing = FALSE)
cgGFAC_id
[1] "ATRT"  "DMG"   "HGG"   "LGG"   "Other"
# Define parameters for function
x_value <- df_plot_cgGFAC$Kids_First_Biospecimen_ID
title <- paste("Variant types in PBTA cohort across cgGFAC", sep = " ")

# Run function
p <- create_stacked_barplot_variant_cgGFAC(count_df = df_plot_cgGFAC, x = x_value, palette = palette, title = title)

Alterations per timepoint in each cancer type and timepoint model

tm_df_plot <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  group_by(tumor_descriptor, cg_id, timepoints_models, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

tm <- as.character(unique(tm_df_plot$timepoints_models))
tm <- sort(tm, decreasing = FALSE)
tm
 [1] "Dx-Dec"         "Dx-Pro"         "Dx-Pro-Dec"     "Dx-Pro-Rec"     "Dx-Pro-Rec-Dec" "Dx-Rec"         "Dx-Rec-Dec"     "Dx-SM"         
 [9] "Pro-Dec"        "Pro-Rec"        "Pro-Rec-Dec"    "Rec-Dec"        "Rec-SM"        
# Loop through variable
for (i in seq_along(tm)){
  print(i)
  df_sub <- tm_df_plot %>%
      filter(timepoints_models == tm[i])
  
   # Define parameters for function
  x_value <- df_sub$Kids_First_Biospecimen_ID
  title <- paste(tm[i])
  
  # Run function
  p <- create_stacked_barplot_variant_cg_id(count_df = df_sub, x = x_value, palette = palette, title = title)
  
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13

sessionInfo()
R version 4.2.3 (2023-03-15)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Ventura 13.5.2

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggthemes_4.2.4  lubridate_1.9.2 forcats_1.0.0   stringr_1.5.0   dplyr_1.1.2     purrr_1.0.1     readr_2.1.4     tidyr_1.3.0    
 [9] tibble_3.2.1    ggplot2_3.4.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0  xfun_0.39         bslib_0.5.0       carData_3.0-5     colorspace_2.1-0  vctrs_0.6.3       generics_0.1.3   
 [8] htmltools_0.5.5   yaml_2.3.7        utf8_1.2.3        rlang_1.1.1       pillar_1.9.0      jquerylib_0.1.4   ggpubr_0.6.0     
[15] glue_1.6.2        withr_2.5.0       bit64_4.0.5       lifecycle_1.0.3   munsell_0.5.0     ggsignif_0.6.4    gtable_0.3.3     
[22] ragg_1.2.5        evaluate_0.21     labeling_0.4.2    knitr_1.43        tzdb_0.4.0        fastmap_1.1.1     parallel_4.2.3   
[29] fansi_1.0.4       broom_1.0.5       scales_1.2.1      backports_1.4.1   cachem_1.0.8      vroom_1.6.3       jsonlite_1.8.7   
[36] abind_1.4-5       systemfonts_1.0.4 farver_2.1.1      bit_4.0.5         textshaping_0.3.6 hms_1.1.3         digest_0.6.33    
[43] stringi_1.7.12    rstatix_0.7.2     rprojroot_2.0.3   cli_3.6.1         tools_4.2.3       magrittr_2.0.3    sass_0.4.7       
[50] crayon_1.5.2      car_3.1-2         pkgconfig_2.0.3   timechange_0.2.0  rmarkdown_2.23    rstudioapi_0.15.0 R6_2.5.1         
[57] compiler_4.2.3   
---
title: "Classification of Variants across paired longitudinal samples in the PBTA Cohort"
author: 'Antonia Chroni <chronia@chop.edu> for D3B'
date: "2023"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

# Set up
```{r load-library}
suppressPackageStartupMessages({
  library(tidyverse)
})
```

# Directories and File Inputs/Outputs
```{r set-dir-and-file-names}
# Anchor every path on the repository root (the folder holding ".git") so the
# notebook resolves the same files regardless of the working directory it is
# launched from.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))

# Module directories
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(
  root_dir, "analyses", "sample-distribution-analysis", "results"
)

# Inputs
# pbta.tsv is the file from the add-sample-distribution module
pbta_file <- file.path(files_dir, "pbta.tsv")
genomic_paired_file <- file.path(
  files_dir, "genomic_assays_matched_time_points.tsv"
)
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(
  root_dir, "figures", "palettes", "oncoprint_color_palette.tsv"
)

# Output directory for the plots; created on first run
plots_dir <- file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

# Load the shared figure theme script and this module's barplot script
# (presumably where the create_stacked_barplot_* helpers used below are
# defined -- TODO confirm).
source(file.path(root_dir, "figures", "scripts", "theme.R"))
source(file.path(analysis_dir, "util", "function-create-barplot.R"))
```

# Read in data and process

```{r load-process-inputs}
# Sample annotations: keep only the identifier, cancer-group and
# tumor-descriptor columns needed below.
pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
  select(
    Kids_First_Participant_ID, Kids_First_Biospecimen_ID,
    cg_multiple, cg_id, cgGFAC, tumor_descriptor
  )

# TMB/VAF table: keep rows with tmb below 10 (rows with missing tmb are
# dropped by filter() as well) and the columns used downstream.
tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>%
  filter(tmb < 10) %>%
  select(
    Kids_First_Biospecimen_ID, Variant_Classification, gene_protein,
    mutation_count, region_size, tmb, VAF
  )

# Attach annotations (by participant) and TMB/VAF rows (by biospecimen) to the
# paired-timepoint table, then drop rows that ended up without a TMB value.
genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>%
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))

# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.

# Which patient samples don't have TMB?
# genomic_paired_df %>% 
#  filter(is.na(tmb)) %>% 
#  unique() %>% 
#  regulartable() %>%
#  fontsize(size = 12, part = "all")

# One row per participant: all of their tumor descriptors, sorted and
# collapsed into a single comma-separated string.
descriptors_df <- genomic_paired_df %>%
  group_by(Kids_First_Participant_ID) %>%
  summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "))

# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")

# Create df to use for plots
# NOTE(review): the join below keys on `descriptors`, so it assumes
# genomic_paired_df already carries a `descriptors` column -- confirm against
# genomic_assays_matched_time_points.tsv.
# NOTE(review): str_replace() only substitutes the first "/" and the first "-"
# in cg_id_kids, whereas spaces are all replaced -- confirm this is intended.
df_plot <- genomic_paired_df %>%
  left_join(descriptors_df, by = c("Kids_First_Participant_ID", "descriptors")) %>%
  mutate(
    # Computed while tumor_descriptor is still character (before the factor
    # conversion further down).
    td_cgGFAC = case_when(
      grepl("Deceased", tumor_descriptor) ~ "xDeceased",
      TRUE ~ tumor_descriptor
    ),
    log10_tmb = abs(log10(tmb)),
    cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
    cg_id_kids = str_replace(cg_id_kids, "/", "_"),
    cg_id_kids = str_replace(cg_id_kids, "-", "_"),
    cg_id_kids = str_replace_all(cg_id_kids, " ", "_"),
    tumor_descriptor = factor(tumor_descriptor),
    # Relevel using only the timepoints that actually occur in the data;
    # passing absent levels (e.g. "Unavailable") makes fct_relevel() warn
    # about unknown levels.
    tumor_descriptor = fct_relevel(
      tumor_descriptor,
      intersect(timepoints, levels(tumor_descriptor))
    )
  ) %>%
  # The cgGFAC and timepoint-model chunks below group/count by
  # `timepoint_cgGFAC_n`, so it has to exist on df_plot: number of rows per
  # cgGFAC x td_cgGFAC combination plus a "<cgGFAC>_<td_cgGFAC>  (N=<n>)" label.
  add_count(cgGFAC, td_cgGFAC, name = "timepoint_cgGFAC_number") %>%
  mutate(
    timepoint_cgGFAC_n = paste0(
      cgGFAC, "_", td_cgGFAC, "  (N=", timepoint_cgGFAC_number, ")"
    )
  )

# Number of rows per timepoint, cancer group, biospecimen and variant class.
# The result stays grouped by those four columns, as count() on a grouped
# data frame preserves the grouping.
count_df <- df_plot %>%
  group_by(tumor_descriptor, cg_id, Kids_First_Biospecimen_ID, Variant_Classification) %>%
  dplyr::count(cg_id)


``` 

# Define parameters for plots

```{r define-parameters-for-plots}
# Load the palette table and build a named vector from it: values are the
# hex codes, names are the matching color names.
palette_df <- readr::read_tsv(
  palette_file,
  guess_max = 100000,
  show_col_types = FALSE
)
palette <- setNames(palette_df$hex_codes, palette_df$color_names)

```

# Alterations per timepoint

```{r plot-timepoint, fig.width = 6, fig.height = 6, fig.fullwidth = TRUE}
# Inputs for the plotting helper: tumor descriptor (timepoint) on the x axis
x_value <- count_df$tumor_descriptor
title <- "Variant types in PBTA cohort"

# Build the plot and report where it will be written
fname <- file.path(plots_dir, "Alteration_type_timepoints_barplots.pdf")
print(fname)
p <- create_stacked_barplot_variant(count_df = count_df, x = x_value, palette = palette, title = title)

# Write the plot to PDF. dev.off() runs in `finally` so the device is closed
# even when print(p) fails; otherwise an error would leave the PDF device open
# and later plots would be drawn into it.
pdf(file = fname, width = 6, height = 6)
invisible(tryCatch(print(p), finally = dev.off()))
```

# Alterations per timepoint in each cancer type

```{r plot-cg-id, fig.width = 30, fig.height = 18, fig.fullwidth = TRUE}
# Alphabetically sorted cancer groups present in the counts table (printed
# for reference in the notebook output)
cg_id_id <- sort(as.character(unique(count_df$cg_id)))
cg_id_id

# Inputs for the plotting helper: one bar per biospecimen
x_value <- count_df$Kids_First_Biospecimen_ID
title <- "Variant types in PBTA cohort across cancer groups"

# Build the plot
p <- create_stacked_barplot_variant_cg_id(
  count_df = count_df,
  x = x_value,
  palette = palette,
  title = title
)

```

# Alterations per timepoint in each cancer type defined by cgGFAC

```{r plot-cgGFAC-n-individual-plots, fig.width = 5, fig.height = 5, fig.fullwidth = TRUE}
# Row counts per timepoint, cgGFAC group, group label, biospecimen and variant
# class, restricted to rows that have a timepoint model.
# NOTE(review): relies on df_plot carrying `timepoints_models` and
# `timepoint_cgGFAC_n` -- confirm both are present upstream.
has_model_df <- filter(df_plot, !is.na(timepoints_models))
df_plot_cgGFAC <- has_model_df %>%
  arrange(timepoint_cgGFAC_n) %>%
  group_by(
    tumor_descriptor, cgGFAC, timepoint_cgGFAC_n,
    Kids_First_Biospecimen_ID, Variant_Classification
  ) %>%
  dplyr::count(timepoint_cgGFAC_n)

# Alphabetically sorted cgGFAC groups (printed for reference)
cgGFAC_id <- sort(as.character(unique(df_plot_cgGFAC$cgGFAC)))
cgGFAC_id

# Inputs for the plotting helper: one bar per biospecimen
x_value <- df_plot_cgGFAC$Kids_First_Biospecimen_ID
title <- "Variant types in PBTA cohort across cgGFAC"

# Build the plot
p <- create_stacked_barplot_variant_cgGFAC(
  count_df = df_plot_cgGFAC,
  x = x_value,
  palette = palette,
  title = title
)

```

# Alterations per timepoint in each cancer type and timepoint model

```{r plot-timepoint-model, fig.width = 15, fig.height = 12, fig.fullwidth = TRUE}
# Row counts per timepoint, cancer group, timepoint model, biospecimen,
# variant class and cgGFAC label, for rows that have a timepoint model.
# NOTE(review): relies on df_plot carrying `timepoints_models` and
# `timepoint_cgGFAC_n` -- confirm both are present upstream.
tm_df_plot <- df_plot %>%
  filter(!is.na(timepoints_models)) %>%
  group_by(
    tumor_descriptor, cg_id, timepoints_models,
    Kids_First_Biospecimen_ID, Variant_Classification
  ) %>%
  dplyr::count(timepoint_cgGFAC_n)

# Alphabetically sorted timepoint models (printed for reference)
tm <- sort(as.character(unique(tm_df_plot$timepoints_models)))
tm

# One plot per timepoint model; the loop index is printed as a progress marker
for (i in seq_along(tm)) {
  print(i)

  # Rows belonging to the current timepoint model
  df_sub <- filter(tm_df_plot, timepoints_models == tm[i])

  # Inputs for the plotting helper: one bar per biospecimen, titled with
  # the model name
  x_value <- df_sub$Kids_First_Biospecimen_ID
  title <- tm[i]

  # Build the plot
  p <- create_stacked_barplot_variant_cg_id(
    count_df = df_sub,
    x = x_value,
    palette = palette,
    title = title
  )
}
```


```{r echo=TRUE}
# Record the R version, platform and attached/loaded package versions of this
# run for reproducibility.
sessionInfo()
```
